#load libraries
library(tidyverse)
library(quanteda) #for text cleaning
library(igraph) #for creating graphs
library(visNetwork) #for visualizing graphs
library(wordcloud) #for creating wordclouds
#load_functions
source("calculatecoocstats.R") #calculate co-occurrence statistics
source("grapher.R") #create graph
#Wiedemann, Gregor; Niekler, Andreas (2017): Hands-on: A five day text mining course for humanists and social scientists in R. Proceedings of the 1st Workshop on Teaching NLP for Digital Humanities (Teach4DH@GSCL 2017), Berlin.
source("rawcounts.R") #find raw counts of co-occurrences
source("token_filter.R") #filter tokens
#load tokens, get it ready for analysis
#expects a quanteda tokens object `token.all` with a `decade` docvar
load("token.all.RData")
#convert tokens to all lower
token.all <- tokens_tolower(token.all) #convert all tokens to lower
#sample based on min in a decade
#NOTE(review): 22638 is presumably the token/document count of the smallest
#decade, so each decade contributes equally after sampling -- confirm against corpus
token.all = tokens_sample(token.all, size = 22638, replace = FALSE, prob = NULL, by = decade)
#create a token set with only generalized pos info
#Collapse "word/POS" tokens down to their bare POS tag.
#
#toks.replace: quanteda tokens object whose tokens end in /NOUN, /VERB or /ADJ
#returns:      the same tokens object with matching tokens replaced by the tag
#
#Glob patterns are matched case-insensitively by quanteda's default, so this
#also works after tokens_tolower().
pos_replace <- function(toks.replace){
  tag_map <- c("*/NOUN" = "NOUN", "*/VERB" = "VERB", "*/ADJ" = "ADJ")
  tokens_replace(toks.replace,
                 pattern = names(tag_map),
                 replacement = unname(tag_map))
}
#Empirical POS probabilities in the context of male/female character tokens,
#computed separately for each decade 1940-2010.
token.pos <- pos_replace(token.all) #token set with only generalized pos info
pos <- c('verb', 'adj', 'noun') #pos to be analysed
#build the per-gender probability table from one decade's co-occurrence counts
gender_probs <- function(counts, g) {
  counts_row <- counts[paste(g, "characters", sep = "/"), pos] #filter pos
  out <- data.frame(pos = names(counts_row),
                    p = counts_row / sum(counts_row)) #empirical probability
  out$gender <- g #assign gender
  out
}
p_decdat <- data.frame() #accumulates one row per pos x gender x decade
for (year in seq(1940, 2010, by = 10)) { #one iteration per decade
  pos_counts <- rawcounts(token_filter("all", year, token.pos)) #raw co-occurrence counts
  decade_dat <- rbind(gender_probs(pos_counts, "male"),
                      gender_probs(pos_counts, "female")) #bind gender data
  decade_dat$year <- year #assign year
  p_decdat <- rbind(p_decdat, decade_dat) #bind ind. decade with overall
}
## Df Sum Sq Mean Sq F value Pr(>F)
## year 1 0.0001165 1.165e-04 2.296 0.156
## gender 1 0.0000094 9.370e-06 0.185 0.675
## year:gender 1 0.0000066 6.620e-06 0.130 0.724
## Residuals 12 0.0006091 5.076e-05
## Df Sum Sq Mean Sq F value Pr(>F)
## year 1 4.602e-05 4.602e-05 2.372 0.149
## gender 1 4.000e-07 4.000e-07 0.021 0.888
## year:gender 1 2.500e-07 2.500e-07 0.013 0.911
## Residuals 12 2.328e-04 1.940e-05
## Saving 7 x 5 in image
## Df Sum Sq Mean Sq F value Pr(>F)
## year 1 1.608e-05 1.608e-05 2.039 0.179
## gender 1 1.364e-05 1.364e-05 1.729 0.213
## year:gender 1 4.300e-06 4.297e-06 0.545 0.475
## Residuals 12 9.461e-05 7.885e-06
#reload the full token set for the community-detection analysis below
load("token.all.RData")
#convert tokens to all lower
token.all <- tokens_tolower(token.all) #convert all tokens to lower
#resample so each decade contributes equally (same size as the earlier analysis)
token.all = tokens_sample(token.all, size = 22638, replace = FALSE, prob = NULL, by = decade)
#token.all <- token_filter2('all', 2000, 2010, token.all)
#Detect and plot Louvain communities of words co-occurring with a gendered
#character token.
#
#toks.all:   quanteda tokens object (with the docvars token_filter2 expects)
#gender:     'male' or 'female' -- which "<gender>/characters" token to analyse
#nn:         number of top-degree words to report per community
#year_start, year_end: year range passed to token_filter2
#            (new, backward-compatible: defaults keep the old hard-coded 2000-2010)
#
#Side effects: prints the community summary, the top-nn table and the
#modularity, and plots the subgraph of the top five communities.
detect_communities <- function(toks.all, gender = 'male', nn = 10,
                               year_start = 2000, year_end = 2010){
  toks.all <- token_filter2('all', year_start, year_end, toks.all)
  #remove the gendered character token and its 5-token window, keeping padding
  toks <- toks.all %>%
    tokens_select(pattern = paste(gender, '/characters', sep = ''),
                  selection = 'remove', padding = TRUE, window = 5)
  #filter to keep only words that occur at least 10 times
  #(named dfmat so quanteda::dfm() is not shadowed)
  dfmat <- toks %>% dfm() %>% dfm_trim(min_termfreq = 10)
  filtered <- colnames(dfmat)
  #FIX: was the global `token.all` (unfiltered) -- use the year-filtered
  #argument so the graph actually reflects the selected period
  toks <- toks.all %>%
    tokens_select(pattern = filtered, selection = 'keep', padding = TRUE)
  #feature co-occurrence matrix
  fcmat <- fcm(toks, context = c("window"),
               count = c("weighted"), #words are weighted within the window
               window = 5)
  graph <- graph_from_adjacency_matrix(fcmat, weighted = TRUE) #graph from matrix
  #round-trip through an edge list to obtain an undirected weighted graph
  edgelist <- igraph::as_data_frame(graph)
  edgelist_m <- as.matrix(edgelist[, c("from", "to")])
  graph <- graph_from_edgelist(edgelist_m, directed = FALSE)
  graph <- igraph::set_edge_attribute(graph, "weight", value = edgelist$weight)
  graph <- simplify(graph, remove.loops = TRUE) #remove self-looping edges
  #louvain communities
  #FIX: the edge attribute is `weight`, not `weights` (old code passed NULL)
  louvain <- cluster_louvain(graph, weights = E(graph)$weight) #detect communities
  graph$community <- louvain$membership
  #most important word (max betweenness) in each community
  communities <- data.frame()
  for (i in unique(graph$community)) {
    # create subgraphs for each community
    subgraph <- induced_subgraph(graph, v = which(graph$community == i))
    # get size of each subgraph
    size <- igraph::gorder(subgraph)
    # get betweenness centrality
    btwn <- igraph::betweenness(subgraph)
    communities <- communities %>%
      dplyr::bind_rows(
        data.frame(community = i,
                   n_characters = size,
                   most_important = names(which(btwn == max(btwn)))
        )
      )
  }
  communities <- arrange(communities, desc(n_characters))
  #FIX: head() instead of [1:5] so fewer than five communities yields no NAs
  top_comm <- head(communities$community, 5)
  print(communities)
  #top nn (by degree) words in each of the top communities
  top_ten <- data.frame()
  for (i in top_comm) {
    # create subgraphs for each community
    subgraph <- induced_subgraph(graph, v = which(graph$community == i))
    # degree within the community (deg: avoids shadowing igraph::degree)
    deg <- igraph::degree(subgraph)
    # get top nn degrees
    top <- names(head(sort(deg, decreasing = TRUE), nn))
    #FIX: seq_along(top) instead of 1:nn -- a community with fewer than nn
    #words no longer triggers a data.frame recycling error
    result <- data.frame(community = i, rank = seq_along(top), word = top)
    top_ten <- top_ten %>%
      dplyr::bind_rows(result)
  }
  print(top_ten)
  #write.csv(top_ten, paste(gender, '.csv', sep = ''))
  print(paste('modularity =', modularity(louvain)))
  #Visualizing the top communities
  subgraph <- induced_subgraph(graph, v = top_ten$word)
  subgraph <- simplify(subgraph)
  nodes <- data.frame(word = names(V(subgraph)))
  #FIX: group label = position of the community in top_comm; robust when a
  #community contributed fewer than nn words (old rep(1:n, each = nn) assumed
  #exactly nn rows per community)
  top_ten$group <- match(top_ten$community, top_comm)
  clusters <- inner_join(nodes, top_ten, by = "word")
  subgraph$community <- clusters$group
  # give our nodes some properties, incl coloring them by community
  V(subgraph)$size <- 5
  V(subgraph)$frame.color <- "white"
  V(subgraph)$color <- subgraph$community
  V(subgraph)$label.cex <- 1.8
  #cluster object drives both the layout weighting and the colors
  clust_obj <- make_clusters(subgraph, membership = clusters$group)
  prettyColors <- c("turquoise4", "azure4", "olivedrab", "deeppink4", "blue")
  communityColors <- prettyColors[membership(clust_obj)]
  #heavy within-community edges pull each community together in the layout;
  #light between-community edges push communities apart
  edge.weights <- function(community, network, weight.within = 100, weight.between = 1) {
    bridges <- crossing(communities = community, graph = network)
    ifelse(test = bridges, yes = weight.between, no = weight.within)
  }
  E(subgraph)$weight <- edge.weights(clust_obj, subgraph)
  layout <- layout_with_fr(subgraph, weights = E(subgraph)$weight)
  plot(subgraph, layout = layout, col = communityColors)
}
#run community detection for male characters (printed output follows)
detect_communities(token.all, 'male', 10)
## community n_characters most_important
## 1 2 539 battle/noun
## 2 5 383 many/adj
## 3 6 356 true/adj
## 4 3 346 starts/verb
## 5 4 89 suicide/noun
## 6 1 87 travel/noun
## community rank word
## 1 2 1 takes/verb
## 2 2 2 car/noun
## 3 2 3 death/noun
## 4 2 4 killed/verb
## 5 2 5 body/noun
## 6 2 6 room/noun
## 7 2 7 dead/adj
## 8 2 8 killing/verb
## 9 2 9 kills/verb
## 10 2 10 using/verb
## 11 5 1 other/adj
## 12 5 2 have/verb
## 13 5 3 take/verb
## 14 5 4 find/verb
## 15 5 5 men/noun
## 16 5 6 go/verb
## 17 5 7 get/verb
## 18 5 8 leave/verb
## 19 5 9 money/noun
## 20 5 10 make/verb
## 21 6 1 female/characters
## 22 6 2 father/noun
## 23 6 3 tells/verb
## 24 6 4 mother/noun
## 25 6 5 has/verb
## 26 6 6 finds/verb
## 27 6 7 family/noun
## 28 6 8 man/noun
## 29 6 9 house/noun
## 30 6 10 wife/noun
## 31 3 1 new/adj
## 32 3 2 life/noun
## 33 3 3 begins/verb
## 34 3 4 film/noun
## 35 3 5 own/adj
## 36 3 6 become/verb
## 37 3 7 school/noun
## 38 3 8 including/verb
## 39 3 9 story/noun
## 40 3 10 world/noun
## 41 4 1 police/noun
## 42 4 2 local/adj
## 43 4 3 taken/verb
## 44 4 4 murder/noun
## 45 4 5 officer/noun
## 46 4 6 gets/verb
## 47 4 7 arrested/verb
## 48 4 8 arrive/verb
## 49 4 9 prison/noun
## 50 4 10 sent/verb
## [1] "modularity = 0.119871233523078"
#run community detection for female characters (printed output follows)
detect_communities(token.all, 'female', 10)
## community n_characters most_important
## 1 3 734 killing/verb
## 2 6 673 arrives/verb
## 3 2 630 local/adj
## 4 4 506 night/noun
## 5 1 205 store/noun
## 6 5 2 serial/adj
## 7 5 2 killer/noun
## community rank word
## 1 3 1 car/noun
## 2 3 2 house/noun
## 3 3 3 killed/verb
## 4 3 4 room/noun
## 5 3 5 body/noun
## 6 3 6 killing/verb
## 7 3 7 causing/verb
## 8 3 8 kills/verb
## 9 3 9 dead/adj
## 10 3 10 falls/verb
## 11 6 1 male/characters
## 12 6 2 is/verb
## 13 6 3 be/verb
## 14 6 4 father/noun
## 15 6 5 has/verb
## 16 6 6 man/noun
## 17 6 7 tells/verb
## 18 6 8 family/noun
## 19 6 9 wife/noun
## 20 6 10 mother/noun
## 21 2 1 other/adj
## 22 2 2 men/noun
## 23 2 3 find/verb
## 24 2 4 get/verb
## 25 2 5 police/noun
## 26 2 6 group/noun
## 27 2 7 town/noun
## 28 2 8 are/verb
## 29 2 9 kill/verb
## 30 2 10 help/verb
## 31 4 1 time/noun
## 32 4 2 new/adj
## 33 4 3 life/noun
## 34 4 4 film/noun
## 35 4 5 make/verb
## 36 4 6 begins/verb
## 37 4 7 have/verb
## 38 4 8 own/adj
## 39 4 9 school/noun
## 40 4 10 way/noun
## 41 1 1 money/noun
## 42 1 2 take/verb
## 43 1 3 give/verb
## 44 1 4 job/noun
## 45 1 5 pay/verb
## 46 1 6 offers/verb
## 47 1 7 more/adj
## 48 1 8 company/noun
## 49 1 9 given/verb
## 50 1 10 giving/verb
## [1] "modularity = 0.092771822096592"